{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dd08c6f4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f4bf5ada2fbe45aba84156225805d715",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'/home/mesolitica/stt/Classification-Speech-Instructions'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "# Download only the files matching data/*.parquet from the source dataset repo\n",
    "# into ./Classification-Speech-Instructions (relative to the working directory).\n",
    "# The value returned by snapshot_download is shown as the cell output.\n",
    "download_options = dict(\n",
    "    repo_id=\"mesolitica/Classification-Speech-Instructions\",\n",
    "    repo_type='dataset',\n",
    "    allow_patterns=\"data/*.parquet\",\n",
    "    local_dir=\"./Classification-Speech-Instructions\",\n",
    ")\n",
    "snapshot_download(**download_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6bd46a01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Classification-Speech-Instructions/data/gender_age-00003-of-00005.parquet',\n",
       " 'Classification-Speech-Instructions/data/gender_age-00002-of-00005.parquet',\n",
       " 'Classification-Speech-Instructions/data/gender_age-00004-of-00005.parquet',\n",
       " 'Classification-Speech-Instructions/data/gender_age-00000-of-00005.parquet',\n",
       " 'Classification-Speech-Instructions/data/gender_age-00001-of-00005.parquet']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "# glob() returns matches in a filesystem-dependent order; sort so the shards\n",
    "# (and therefore the records built from them) are processed in a stable order.\n",
    "files = sorted(glob('Classification-Speech-Instructions/data/gender_age-*.parquet'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bdaec324",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19790.98it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19764.96it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19911.51it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 19924.42it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 19953.65it/s]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Build one record per parquet row whose mp3 file exists under\n",
    "# Classification-Speech-Instructions-audio; the answer is the 'age' field of the\n",
    "# row's JSON metadata.\n",
    "ages = []\n",
    "for parquet_path in files:\n",
    "    shard_name = os.path.basename(parquet_path).replace('.parquet', '')\n",
    "    shard = pd.read_parquet(parquet_path)\n",
    "    for row_idx in tqdm(range(len(shard))):\n",
    "        meta = json.loads(shard['metadata'].iloc[row_idx])\n",
    "        audio_path = os.path.join(\n",
    "            'Classification-Speech-Instructions-audio', f'{shard_name}-{row_idx}.mp3'\n",
    "        )\n",
    "        if os.path.exists(audio_path):\n",
    "            # Keep the whole original row, minus its 'audio_filename' column, as metadata.\n",
    "            row = shard.iloc[row_idx].to_dict()\n",
    "            row.pop('audio_filename')\n",
    "            record = {\n",
    "                'audio_filename': audio_path,\n",
    "                'metadata': json.dumps(row),\n",
    "                'answer': meta['age'],\n",
    "            }\n",
    "            ages.append(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e32a92ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['twenties', 'fifties', 'nineties', 'teens', 'fourties', 'sixties', 'thirties', 'seventies', 'eighties']\""
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sort the labels: iterating a set of strings has no stable order between kernel\n",
    "# runs, which would make the label list embedded in every prompt irreproducible.\n",
    "unique_ages = str(sorted(set(r['answer'] for r in ages)))\n",
    "unique_ages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d04d2fe2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Seed the global RNG so the prompt template picked for each record (here and in\n",
    "# later cells that call random.choice) is reproducible on Restart & Run All.\n",
    "SEED = 42\n",
    "random.seed(SEED)\n",
    "\n",
    "# Prompt templates; {labels} is filled with the stringified list of candidate labels.\n",
    "questions = [\n",
    "    'given the labels\\n{labels}\\n\\nwhat is the label for the audio',\n",
    "    'what is the label for audio\\n\\nthe labels: {labels}'\n",
    "]\n",
    "\n",
    "# Attach one randomly chosen, filled-in prompt to every age record.\n",
    "for record in ages:\n",
    "    record['question'] = random.choice(questions).format(labels=unique_ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "aa3cb8da",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19745.60it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19859.80it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19793.89it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 20067.53it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 20257.01it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one record per parquet row whose mp3 file exists under\n",
    "# Classification-Speech-Instructions-audio; the answer is the 'gender' field of\n",
    "# the row's JSON metadata.\n",
    "genders = []\n",
    "for parquet_path in files:\n",
    "    shard_name = os.path.basename(parquet_path).replace('.parquet', '')\n",
    "    shard = pd.read_parquet(parquet_path)\n",
    "    for row_idx in tqdm(range(len(shard))):\n",
    "        meta = json.loads(shard['metadata'].iloc[row_idx])\n",
    "        audio_path = os.path.join(\n",
    "            'Classification-Speech-Instructions-audio', f'{shard_name}-{row_idx}.mp3'\n",
    "        )\n",
    "        if os.path.exists(audio_path):\n",
    "            # Keep the whole original row, minus its 'audio_filename' column, as metadata.\n",
    "            row = shard.iloc[row_idx].to_dict()\n",
    "            row.pop('audio_filename')\n",
    "            record = {\n",
    "                'audio_filename': audio_path,\n",
    "                'metadata': json.dumps(row),\n",
    "                'answer': meta['gender'],\n",
    "            }\n",
    "            genders.append(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "761ba010",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['male_masculine', 'female_feminine']\""
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sort the labels: iterating a set of strings has no stable order between kernel\n",
    "# runs, which would make the label list embedded in every prompt irreproducible.\n",
    "unique_genders = str(sorted(set(r['answer'] for r in genders)))\n",
    "unique_genders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e557664c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach one randomly chosen prompt template, filled with the gender labels,\n",
    "# to every gender record.\n",
    "for record in genders:\n",
    "    template = random.choice(questions)\n",
    "    record['question'] = template.format(labels=unique_genders)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e0f5b513",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19679.12it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19763.98it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19363.36it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 20083.36it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 19702.83it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one record per parquet row whose mp3 file exists under\n",
    "# Classification-Speech-Instructions-audio; the answer is the 'locale' field of\n",
    "# the row's JSON metadata.\n",
    "locales = []\n",
    "for parquet_path in files:\n",
    "    shard_name = os.path.basename(parquet_path).replace('.parquet', '')\n",
    "    shard = pd.read_parquet(parquet_path)\n",
    "    for row_idx in tqdm(range(len(shard))):\n",
    "        meta = json.loads(shard['metadata'].iloc[row_idx])\n",
    "        audio_path = os.path.join(\n",
    "            'Classification-Speech-Instructions-audio', f'{shard_name}-{row_idx}.mp3'\n",
    "        )\n",
    "        if not os.path.exists(audio_path):\n",
    "            continue\n",
    "        # Keep the whole original row, minus its 'audio_filename' column, as metadata.\n",
    "        row = shard.iloc[row_idx].to_dict()\n",
    "        row.pop('audio_filename')\n",
    "        # Only keep rows whose locale value has at least two characters.\n",
    "        if len(meta['locale']) >= 2:\n",
    "            record = {\n",
    "                'audio_filename': audio_path,\n",
    "                'metadata': json.dumps(row),\n",
    "                'answer': meta['locale'],\n",
    "            }\n",
    "            locales.append(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "825d29d4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"['tr', 'as', 'zgh', 'ur', 'sl', 'el', 'ca', 'de', 'ar', 'is', 'ab', 'tt', 'pl', 'en', 'ro', 'ug', 'oc', 'da', 'ko', 'sv-SE', 'kmr', 'ne-NP', 'cs', 'az', 'skr', 'ba', 'pt', 'dv', 'ky', 'pa-IN', 'ia', 'rw', 'tk', 'bg', 'zh-CN', 'bn', 'eu', 'myv', 'nhi', 'mdf', 'eo', 'sq', 'tok', 'ka', 'tw', 'et', 'mrj', 'nn-NO', 'ti', 'sw', 'ps', 'lg', 'ml', 'rm-sursilv', 'th', 'nan-tw', 'hsb', 'fi', 'sat', 'yue', 'cnh', 'cy', 'uk', 'fa', 'it', 'hy-AM', 'yo', 'rm-vallader', 'ltg', 'sr', 'mt', 'br', 'mn', 'sc', 'hi', 'ha', 'lv', 'gn', 'sk', 'te', 'cv', 'hu', 'kab', 'gl', 'he', 'lo', 'ckb', 'kk', 'ta', 'mr', 'ga-IE', 'ig', 'lt', 'ast', 'vi', 'or', 'af', 'mhr', 'be', 'am', 'nl', 'es', 'ru', 'fy-NL', 'id', 'fr', 'sah', 'uz', 'mk', 'os', 'lij', 'ja', 'dyu']\""
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sort the labels: iterating a set of strings has no stable order between kernel\n",
    "# runs, which would make the label list embedded in every prompt irreproducible.\n",
    "unique_locales = str(sorted(set(r['answer'] for r in locales)))\n",
    "unique_locales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "db309e91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach one randomly chosen prompt template, filled with the locale labels,\n",
    "# to every locale record.\n",
    "for record in locales:\n",
    "    template = random.choice(questions)\n",
    "    record['question'] = template.format(labels=unique_locales)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0cbc2685",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(48767, 48767, 48767)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records collected for each of the three label types.\n",
    "tuple(len(records) for records in (ages, genders, locales))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "7f093b16",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Convert the list of age records into a `datasets.Dataset`; the next cell pushes it.\n",
    "dataset = Dataset.from_list(ages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "8da34f1c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8ab3e2c521234fb8a6deb18241a11a5d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "97bb2ffa50c74788a4c28a7b2e89578e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "80035bc407a346a8b2b6369e8925356e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/11.8M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8e46a5307f9848fdb92514340885cf65",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/2.26k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/4abdc3f0b7fe767462e04f5d0c750fec50754d3f', commit_message='Upload dataset', commit_description='', oid='4abdc3f0b7fe767462e04f5d0c750fec50754d3f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild from `ages` here rather than relying on whatever `dataset` currently\n",
    "# holds: the name is reassigned for the gender/language splits in later cells, so\n",
    "# re-running this cell out of order would otherwise push the wrong records as 'age'.\n",
    "dataset = Dataset.from_list(ages)\n",
    "dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split='age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "482255a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "51c89108e3f842098101720c8a23c5d3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1fbaa93e548d48ea9801b29c0985be25",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bf82980a49024ef6b18552bbcb0253a1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/11.8M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b37ad1608a04fe290152c74ac81ef1b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/2.36k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/1e3a505b97cecd946c1358f16771c2b13e281fd0', commit_message='Upload dataset', commit_description='', oid='1e3a505b97cecd946c1358f16771c2b13e281fd0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the gender split and upload it to the shared dataset repo; the value\n",
    "# returned by push_to_hub is shown as the cell output.\n",
    "target_repo = 'mesolitica/Zeroshot-Audio-Classification-Instructions'\n",
    "dataset = Dataset.from_list(genders)\n",
    "dataset.push_to_hub(\n",
    "    target_repo,\n",
    "    split='gender',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2daaf351",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b8b57ac71ac74e3b85cfa198a5986c37",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bda4f942a29940fb85b693a411302be8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fab0c782b6284939b5879bb3b7306f45",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/12.0M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Build the locale records into a dataset and upload it as the 'language' split;\n",
    "# the value returned by push_to_hub is shown as the cell output.\n",
    "target_repo = 'mesolitica/Zeroshot-Audio-Classification-Instructions'\n",
    "dataset = Dataset.from_list(locales)\n",
    "dataset.push_to_hub(\n",
    "    target_repo,\n",
    "    split='language',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
